{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### Merge the resampled training samples into a single frame\n",
    "import pandas as pd\n",
    "import glob\n",
    "path = 'E:/BaiduYunDownload/sougou2/TT/'\n",
    "\n",
    "# Sort the matches: glob returns files in arbitrary order, and a stable order keeps\n",
    "# the seeded shuffle applied to the training frame later reproducible between runs.\n",
    "whole_data = sorted(glob.glob(path+'*.csv'))\n",
    "# Read every part first and concatenate once. Appending inside the loop re-copies the\n",
    "# accumulated frame on each iteration, and DataFrame.append no longer exists in pandas 2.x.\n",
    "# (pd.concat raises ValueError if no CSV file was found.)\n",
    "parts = [pd.read_csv(filename,encoding = 'gb18030') for filename in whole_data]\n",
    "train_resam = pd.concat(parts,ignore_index=True)\n",
    "# The first, unnamed CSV column is used as the sample ID.\n",
    "train_resam = train_resam.rename(columns={'Unnamed: 0':'ID'})\n",
    "train_resam = train_resam[['ID','age','Gender','Education','Query']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Data processing\n",
    "import pandas as pd\n",
    "from sklearn.utils import shuffle\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix, hstack ,vstack\n",
    "np.random.seed(1)\n",
    "import jieba \n",
    "from collections import Counter\n",
    "import jieba.posseg as pseg\n",
    "import jieba.analyse\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.cross_validation import StratifiedKFold\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "from sklearn import svm\n",
    "from multiprocessing.dummy import Pool as ThreadPool\n",
    "from sklearn import preprocessing\n",
    "from bs4 import BeautifulSoup\n",
    "from sklearn.linear_model import RidgeClassifier,Lasso\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn import cross_validation\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score,precision_score\n",
    "\n",
    "def mean_len(L):\n",
    "    num = len(L)-1 if len(L)>1 else 1\n",
    "    sum_len = sum(L)-max(L)\n",
    "    return round(float(sum_len)/num,1)\n",
    "\n",
    "# NOTE: this part relies on `train_resam` (built by the resample-merging cell) and on `mean_len`.\n",
    "def add_query_stats(frame):\n",
    "    \"\"\"Add query-length statistics to ``frame`` in place and return it.\n",
    "\n",
    "    ``frame['Query']`` holds comma-joined queries. Adds the columns 'mean'\n",
    "    (via ``mean_len``), 'max', 'std' (rounded to 0.1) and 'sum', all computed\n",
    "    from the lengths of the individual comma-separated pieces.\n",
    "    \"\"\"\n",
    "    # Split and measure each row once instead of once per statistic.\n",
    "    lengths = [[len(q) for q in joined.split(',')] for joined in frame['Query']]\n",
    "    frame['mean'] = [mean_len(lens) for lens in lengths]\n",
    "    frame['max'] = [max(lens) for lens in lengths]\n",
    "    frame['std'] = [round(np.array(lens).std(),1) for lens in lengths]\n",
    "    frame['sum'] = [np.array(lens).sum() for lens in lengths]\n",
    "    return frame\n",
    "\n",
    "# Every raw line is read whole into a single column 'col' and split on tabs below.\n",
    "train = pd.read_table('E:/BaiduYunDownload/sougou2/user_tag_query.10W.TRAIN',encoding = 'gb18030',header = None,sep = '\\n',names = ['col'])\n",
    "test = pd.read_table('E:/BaiduYunDownload/sougou2/user_tag_query.10W.TEST',encoding = 'gb18030',header = None,sep = '\\n',names = ['col'])\n",
    "\n",
    "# Training line layout: ID, age, Gender, Education, then the queries.\n",
    "train_fields = train['col'].apply(lambda x:x.split('\\t'))\n",
    "train['ID'] = train_fields.apply(lambda f:f[0])\n",
    "train['age'] = train_fields.apply(lambda f:f[1])\n",
    "train['Gender'] = train_fields.apply(lambda f:f[2])\n",
    "train['Education'] = train_fields.apply(lambda f:f[3])\n",
    "train['Query'] = train_fields.apply(lambda f:','.join(f[4::]))\n",
    "# ignore_index gives the merged frame unique row labels; without it the labels of\n",
    "# `train` and `train_resam` overlap, which made label-based row dropping remove\n",
    "# unrelated rows that merely shared a label.\n",
    "train = pd.concat([train,train_resam],ignore_index=True)\n",
    "train = shuffle(train)\n",
    "\n",
    "#### Statistical features\n",
    "add_query_stats(train)\n",
    "\n",
    "for label in ['age','Gender','Education']:\n",
    "    train[label] = train[label].astype(int)\n",
    "\n",
    "# Test line layout: ID, then the queries (no labels).\n",
    "test['ID'] = test['col'].apply(lambda x:x.split('\\t')[0])\n",
    "test['Query'] = test['col'].apply(lambda x:','.join(x.split('\\t')[1::]))\n",
    "#### Statistical features\n",
    "add_query_stats(test)\n",
    "\n",
    "train_data = train.drop('col',axis = 1)\n",
    "test_data = test.drop('col',axis = 1)\n",
    "# Placeholder labels (0) give the test rows the same columns as the training rows.\n",
    "test_data.insert(1,'age',0)\n",
    "test_data.insert(2,'Gender',0)\n",
    "test_data.insert(3,'Education',0)\n",
    "test_data['TT'] = 'test'\n",
    "train_data['TT'] = 'train'\n",
    "\n",
    "# Keep only training rows whose three labels are all non-zero\n",
    "# (presumably 0 marks an unknown label -- confirm against the dataset description).\n",
    "fully_labelled = (train_data['age']!=0)&(train_data['Gender']!=0)&(train_data['Education']!=0)\n",
    "train = train_data[fully_labelled]\n",
    "# Training rows come first, test rows last; later cells rely on this ordering.\n",
    "data = pd.concat([train,test_data],ignore_index=True)\n",
    "### Based on the predictions and the sample distribution, all rows with age 6 or\n",
    "### Education 1/2 are removed; the original author noted this brought no improvement.\n",
    "# Test rows carry the placeholder label 0, so none of them is removed here.\n",
    "excluded = (data['age']==6)|data['Education'].isin([1,2])\n",
    "data = data[~excluded]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(200685, 10)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: (rows, columns) of the combined train+test frame after filtering.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache c:\\users\\admini~1\\appdata\\local\\temp\\jieba.cache\n",
      "Loading model cost 0.285 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "path = 'E:/BaiduYunDownload/sougou/Freq/'\n",
    "#jieba.load_userdict(\"E:/BaiduYunDownload/sougou/Freq/1.txt\")\n",
    "# NOTE(review): set_stop_words configures the jieba.analyse keyword extractor; the plain\n",
    "# jieba.cut call used below does not appear to consult it -- confirm the stop words are applied.\n",
    "jieba.analyse.set_stop_words(path+'stop_wd.txt')\n",
    "def fenci(data):\n",
    "    \"\"\"Segment every entry of ``data['Query']`` with jieba.\n",
    "\n",
    "    Returns a list with one string per row (in row order), the tokens of that\n",
    "    row joined by commas.\n",
    "    \"\"\"\n",
    "    # Iterate over the values directly instead of a label lookup per row.\n",
    "    return [','.join(jieba.cut(query)) for query in data['Query']]\n",
    "\n",
    "# The whole frame is one work item, so the former 8-thread pool ran a single task\n",
    "# (and was never closed). Call the function directly and keep the name `fenci`\n",
    "# bound to the function instead of overwriting it with its result.\n",
    "data['jieba_fenci'] = fenci(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Three bag-of-words views (unigrams + bigrams) of the segmented queries, one per\n",
    "# target; they differ only in the document-frequency cut-offs.\n",
    "texts = data['jieba_fenci'].values\n",
    "vectorizer1 = CountVectorizer(ngram_range= (1,2),min_df=18,max_df = 0.75,)\n",
    "vectorizer2 = CountVectorizer(ngram_range= (1,2),min_df=25,max_df = 0.6)\n",
    "vectorizer3 = CountVectorizer(ngram_range= (1,2),min_df=18,max_df = 0.85,)\n",
    "X_vec1 = vectorizer1.fit_transform(texts)\n",
    "X_vec2 = vectorizer2.fit_transform(texts)\n",
    "X_vec3 = vectorizer3.fit_transform(texts)\n",
    "# sublinear_tf is an on/off switch (tf -> 1 + log(tf)), not a weight. The former values\n",
    "# 0.6 / 0.8 only worked because they are truthy; recent scikit-learn rejects non-bool values.\n",
    "X1 = TfidfTransformer(use_idf = False,smooth_idf = True,sublinear_tf=True).fit_transform(X_vec1)\n",
    "X2 = TfidfTransformer(use_idf = True,smooth_idf = True,sublinear_tf=True).fit_transform(X_vec2)\n",
    "X3 = TfidfTransformer(use_idf = True,smooth_idf = True,sublinear_tf=True).fit_transform(X_vec3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\utils\\validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.\n",
      "  warnings.warn(msg, _DataConversionWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n",
      "D:\\Anaconda2\\lib\\site-packages\\scikit_learn-0.18.1-py2.7-win-amd64.egg\\sklearn\\preprocessing\\data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# Scale the four length statistics to [0, 1] and append them to each text matrix.\n",
    "# The columns are passed as one 2-D array: MinMaxScaler scales every column\n",
    "# independently, and 1-D input (used before) raises ValueError from scikit-learn 0.19 on.\n",
    "minmax = preprocessing.MinMaxScaler()\n",
    "stat_columns = ['mean','std','max','sum']\n",
    "# Cast to float up front so the scaler does not have to convert the integer columns.\n",
    "stats_scaled = minmax.fit_transform(data[stat_columns].values.astype(np.float64))\n",
    "\n",
    "cs = csr_matrix(stats_scaled)\n",
    "# NOTE: X1..X3 are extended in place, so re-running this cell appends the columns again.\n",
    "X1 = hstack((X1, cs),format='csr')\n",
    "X2 = hstack((X2, cs),format='csr')\n",
    "X3 = hstack((X3, cs),format='csr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<200685x459449 sparse matrix of type '<type 'numpy.float64'>'\n",
       "\twith 64075660 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the Education feature matrix (tf-idf columns plus the appended length statistics).\n",
    "X3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_a = np.asarray(data['age'].values)\n",
    "y_g = np.asarray(data['Gender'].values)\n",
    "y_e = np.asarray(data['Education'].values)\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "\n",
    "# Positions of the labelled rows. Test rows only carry the placeholder label 0, so\n",
    "# scoring features on them would treat 'is a test row' as an extra class.\n",
    "train_rows = np.flatnonzero((data['TT']=='train').values)\n",
    "\n",
    "def select_features(X, y, divisor):\n",
    "    \"\"\"Keep the ``X.shape[1] // divisor`` columns of ``X`` with the best chi2 score.\n",
    "\n",
    "    The scores are computed on the training rows only; the fitted selection is\n",
    "    then applied to every row of ``X`` (train and test alike).\n",
    "    \"\"\"\n",
    "    # Floor division keeps k an integer on Python 3 as well.\n",
    "    selector = SelectKBest(chi2, k=X.shape[1]//divisor)\n",
    "    selector.fit(X[train_rows], y[train_rows])\n",
    "    return selector.transform(X)\n",
    "\n",
    "X_new1 = select_features(X1, y_a, 7)\n",
    "X_new2 = select_features(X2, y_g, 6)\n",
    "X_new3 = select_features(X3, y_e, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model Report\n",
      "precision_score : 0.6901\n"
     ]
    }
   ],
   "source": [
    "# Labels of the training rows; the training rows are the first rows of `data` / X_new*.\n",
    "is_train = data['TT']=='train'\n",
    "y1 = np.asarray(data.loc[is_train,'age'].values)\n",
    "y2 = np.asarray(data.loc[is_train,'Gender'].values)\n",
    "y3 = np.asarray(data.loc[is_train,'Education'].values)\n",
    "# Derived from the data instead of assuming the test file has exactly 100000 rows.\n",
    "n_train = len(y1)\n",
    "\n",
    "def holdout_precision(clf, X, y):\n",
    "    \"\"\"Fit ``clf`` on a random split of (X, y); return micro precision on the held-out part.\n",
    "\n",
    "    The split is seeded so that repeated runs are comparable.\n",
    "    \"\"\"\n",
    "    X_fit, X_val, y_fit, y_val = cross_validation.train_test_split(X, y, random_state=1)\n",
    "    clf.fit(X_fit, y_fit)\n",
    "    return precision_score(y_val, clf.predict(X_val), average = 'micro')\n",
    "\n",
    "sgd1 = SGDClassifier(penalty = 'elasticnet',n_jobs=-1,random_state=1)\n",
    "lr2 = LogisticRegression(n_jobs = -1,random_state=1)\n",
    "sgd3 = SGDClassifier(penalty = 'elasticnet',n_jobs=-1,random_state=1)\n",
    "\n",
    "precision_score1 = holdout_precision(sgd1, X_new1[:n_train], y1)\n",
    "precision_score2 = holdout_precision(lr2, X_new2[:n_train], y2)\n",
    "precision_score3 = holdout_precision(sgd3, X_new3[:n_train], y3)\n",
    "# Report the mean of the three per-target scores.\n",
    "print('\\nModel Report')\n",
    "print('precision_score : %.4g' % ((precision_score1+precision_score2+precision_score3)/3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time: 28765.55 seconds\n"
     ]
    }
   ],
   "source": [
    "### A linear SVM can lift the semi-final score to 0.703+\n",
    "import time\n",
    "from multiprocessing.dummy import Pool as ThreadPool\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "start = time.time()\n",
    "def predict(X):\n",
    "    \"\"\"Train a linear SVC on the labelled rows and predict the remaining rows.\n",
    "\n",
    "    X is a pair ``[labels, features]``. The first ``len(labels)`` rows of\n",
    "    ``features`` are the training rows; every row after them is predicted.\n",
    "    Returns the predicted labels for those remaining rows.\n",
    "    \"\"\"\n",
    "    labels, features = X[0], X[1]\n",
    "    # Split at the number of labels rather than at a hard-coded test size of 100000.\n",
    "    n_labelled = len(labels)\n",
    "    clf = SVC(kernel='linear')\n",
    "    clf.fit(features[:n_labelled], labels)\n",
    "    return clf.predict(features[n_labelled:])\n",
    "x1 = [y1,X_new1]\n",
    "x2 = [y2,X_new2]\n",
    "x3 = [y3,X_new3]\n",
    "L1 = [x1,x2,x3]\n",
    "# One task per target (age, Gender, Education).\n",
    "pool = ThreadPool(8)\n",
    "try:\n",
    "    result = pool.map(predict,L1)\n",
    "finally:\n",
    "    # Release the worker threads even when a fit fails.\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "print('Time: {:.2f} seconds'.format(time.time() - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Assemble the submission for the test rows: ID followed by the three SVM predictions.\n",
    "test_ids = data.loc[data['TT']=='test','ID']\n",
    "submission = pd.DataFrame(\n",
    "    {'ID':test_ids.values,'age':result[0],'Gender':result[1],'Education':result[2]},\n",
    "    index = test_ids.index,\n",
    "    columns = ['ID','age','Gender','Education'])\n",
    "# Written space-separated and GBK-encoded, without header or index column.\n",
    "submission.to_csv('E:/BaiduYunDownload/sougou2/12.10svm.csv',sep = ' ',index = False,header = None,encoding ='GBK')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### Stacking (very time-consuming)\n",
    "import random\n",
    "\n",
    "def run(data):\n",
    "    \"\"\"Stack several base classifiers and blend their predictions with logistic regression.\n",
    "\n",
    "    data is a pair ``[X, Y]``: ``X`` is the feature matrix for all rows and ``Y``\n",
    "    holds the labels of its first ``len(Y)`` rows. Each base classifier is trained\n",
    "    with 5-fold stratified cross-validation on the labelled rows; its out-of-fold\n",
    "    predictions become one blender feature, and its predictions for the unlabelled\n",
    "    rows are averaged over the folds.\n",
    "\n",
    "    Returns the blender's predictions for the unlabelled rows ``X[len(Y):]``.\n",
    "    \"\"\"\n",
    "    X = data[0]\n",
    "    Y = data[1]\n",
    "\n",
    "    # The labelled/unlabelled boundary follows from the labels themselves. The former\n",
    "    # constant (105802) no longer matched the data, leaving X_dev and Y_dev with\n",
    "    # different lengths.\n",
    "    dev_cutoff = len(Y)\n",
    "    X_dev = X[:dev_cutoff]\n",
    "    Y_dev = Y[:dev_cutoff]\n",
    "    X_test = X[dev_cutoff:]\n",
    "\n",
    "    n_folds = 5\n",
    "\n",
    "    clfs = [\n",
    "        SVC(kernel='linear'),\n",
    "        svm.LinearSVC(),\n",
    "        SGDClassifier(loss = 'hinge'),\n",
    "        SGDClassifier(penalty = 'elasticnet'),\n",
    "        MultinomialNB(),\n",
    "        BernoulliNB(),\n",
    "        RidgeClassifier(),\n",
    "    ]\n",
    "\n",
    "    # (train_index, cv_index) pairs; materialised because they are reused per classifier.\n",
    "    skf = list(StratifiedKFold(Y_dev, n_folds))\n",
    "\n",
    "    # One column per base classifier.\n",
    "    blend_train = np.zeros((X_dev.shape[0], len(clfs)))\n",
    "    blend_test = np.zeros((X_test.shape[0], len(clfs)))\n",
    "\n",
    "    print('X_test.shape = %s' % (str(X_test.shape)))\n",
    "    print('blend_train.shape = %s' % (str(blend_train.shape)))\n",
    "    print('blend_test.shape = %s' % (str(blend_test.shape)))\n",
    "\n",
    "    for j, clf in enumerate(clfs):\n",
    "        print('Training classifier [%s]' % (j))\n",
    "        # Predictions for the unlabelled rows, one column per fold.\n",
    "        blend_test_j = np.zeros((X_test.shape[0], len(skf)))\n",
    "        for i, (train_index, cv_index) in enumerate(skf):\n",
    "            print('Fold [%s]' % (i))\n",
    "\n",
    "            clf.fit(X_dev[train_index], Y_dev[train_index])\n",
    "\n",
    "            # Out-of-fold predictions: each labelled row is predicted by a model\n",
    "            # that did not see it during training.\n",
    "            blend_train[cv_index, j] = clf.predict(X_dev[cv_index])\n",
    "            blend_test_j[:, i] = clf.predict(X_test)\n",
    "\n",
    "        # NOTE(review): this averages predicted class labels across folds, which can\n",
    "        # yield values that are not valid classes -- consider a majority vote instead.\n",
    "        blend_test[:, j] = blend_test_j.mean(1)\n",
    "\n",
    "    print('Y_dev.shape = %s' % (Y_dev.shape))\n",
    "\n",
    "    # Second-level model trained on the base classifiers' predictions.\n",
    "    bclf = LogisticRegression()\n",
    "    bclf.fit(blend_train, Y_dev)\n",
    "\n",
    "    Y_test_predict = bclf.predict(blend_test)\n",
    "\n",
    "    return Y_test_predict\n",
    "\n",
    "# One [features, labels] work item per target (age, Gender, Education).\n",
    "l1 = [X_new1,y1]\n",
    "l2 = [X_new2,y2]\n",
    "l3 = [X_new3,y3]\n",
    "list1 = [l1,l2,l3]\n",
    "pool = ThreadPool(8)\n",
    "try:\n",
    "    stacking_result = pool.map(run,list1)\n",
    "finally:\n",
    "    # The pool was previously left open; release its threads even when a task fails.\n",
    "    pool.close()\n",
    "    pool.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Assemble the stacking submission: test IDs followed by the three blended predictions.\n",
    "stacking_ids = test['ID']\n",
    "submission1 = pd.DataFrame(\n",
    "    {'ID':stacking_ids.values,'age':stacking_result[0],'Gender':stacking_result[1],'Education':stacking_result[2]},\n",
    "    index = stacking_ids.index,\n",
    "    columns = ['ID','age','Gender','Education'])\n",
    "# Written space-separated and GBK-encoded, without header or index column.\n",
    "submission1.to_csv('E:/BaiduYunDownload/sougou2/12.02stacking.csv',sep = ' ',index = False,header = None,encoding ='GBK')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
